rm(list = ls(all = TRUE))
gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  555049 29.7    1242625 66.4   686457 36.7
## Vcells 1018217  7.8    8388608 64.0  1876634 14.4
library(magrittr)
library(data.table)
library(knitr)

`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

1 Ath gmm

  • lowercase, protein based IDs!
# fp = file.path('..', 'input', 'ath-annot')
fp = file.path('..', 'input', 'Mercator')
# fn = 'ath_Araport11_2018-05-25_mapping.txt.gz'
# fn = 'X4.4_Arabidopsis_thaliana.txt'
fn = 'ath_Mercator4v7_results.txt'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]

combined = gmm[, .(
  BINCODE = paste(unique(BINCODE), collapse = " | "),
  NAME = paste(unique(NAME), collapse = " | "),
  DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]

combined$IDENTIFIER = sub("\\'", '', sub("\\..*$", "", toupper(combined$IDENTIFIER)))
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)
colnames(combined)[2:4] = paste('ath', colnames(combined)[2:4], sep = '_')

ath.gmm = combined

2 Ath SKM & annotation

note: some duplicated ids in PSS

fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12', 
               'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]


fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
  # Remove NA and empty strings
  x = x[!is.na(x) & x != ""]
  paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]


fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]

pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
  pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]

pss_long = melt(
  pss,
  id.vars = c("name", "all_pathways", 'short_name'),       # Columns to keep as is
  measure.vars = patterns("^id_"),           # Columns to melt (all starting with "id_")
  variable.name = "id_num",                   # Name for the melted variable column
  value.name = "id"                           # Name for the melted value column
)

pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))
## 
## FALSE  TRUE 
##   816    24
pss_long %>%
  dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
  dplyr::arrange(id) %>%
  print()
##                                              all_pathways short_name        id
##                                                    <char>     <char>    <char>
##  1:                               Hormone - Ethylene (ET)      ORA59 AT1G06160
##  2:                               Hormone - Ethylene (ET)  ERF/ORA59 AT1G06160
##  3:                         Hormone - Salicylic acid (SA)       TCP8 AT1G58100
##  4:                         Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
##  5:                               Hormone - Ethylene (ET)       EDF2 AT1G68840
##  6:                               Hormone - Ethylene (ET)    ERF/EDF AT1G68840
##  7: Signalling - Heat-shock proteins (HSPs),Stress - Heat      HSP70 AT3G12580
##  8:               Signalling - Heat-shock proteins (HSPs)        HSP AT3G12580
##  9:                               Hormone - Ethylene (ET)       ERF1 AT3G23240
## 10:                               Hormone - Ethylene (ET)    ERF/EDF AT3G23240
## 11:                               Hormone - Ethylene (ET)       ERF6 AT4G17490
## 12:                               Hormone - Ethylene (ET)  ERF/ORA59 AT4G17490
## 13:                               Hormone - Ethylene (ET)       ERF1 AT4G17500
## 14:                               Hormone - Ethylene (ET)    ERF/EDF AT4G17500
## 15:               Signalling - Heat-shock proteins (HSPs)     MED37E AT5G02500
## 16:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G02500
## 17:                               Hormone - Ethylene (ET)     ERF096 AT5G43410
## 18:                               Hormone - Ethylene (ET)    ERF/EDF AT5G43410
## 19:                               Hormone - Ethylene (ET)       ERF5 AT5G47230
## 20:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G47230
## 21:                               Hormone - Ethylene (ET)     ERF105 AT5G51190
## 22:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G51190
## 23:               Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G59720
## 25:                               Hormone - Ethylene (ET)     ERF104 AT5G61600
## 26:                               Hormone - Ethylene (ET)    ERF/EDF AT5G61600
##                                              all_pathways short_name        id
pss_long = pss_long %>%
  dplyr::filter(stringr::str_starts(id, "AT")) %>%
  dplyr::group_by(id) %>%
  dplyr::summarise(
    dplyr::across(
      .cols = dplyr::everything(),
      .fns = ~ {
        vals = unique(na.omit(.))
        if (length(vals) > 1) paste(vals, collapse = " | ")
        else if (length(vals) == 1) vals
        else NA_character_
      }
    ),
    .groups = "drop"
  )

3 Abbreviations

Plant Name Label JCVI-MCScan Compara Plants Plaza OrthoDB FastOma RBH Mercator
Malus domestica apple mdo_GDv1 malus_domestica_golden mdo mdo mdo mdo mdo
mdo_HChap1
Prunus persica ppe ppe prunus_persica ppe ppe pper ppe pper
Prunus dulcis / P. amygdalus almond almond prunus_dulcis pdul pdul pdul pdul
Prunus avium wild cherry wildcherry prunus_avium pavi pavi pavi pavi
Prunus armeniaca apricot apricot parm parm parm parm
Prunus cerasifera cherry plum cherryplum pcer pcer pcer
Pyrus pear pear pcox pcox pcox
Prunus sibirica Siberian apricot siberianapricot psib psib psib
# in OrthoDB and PLAZA

# mdo_HChap1/mdo/mdo
# ppe/ppe/pper


# in OrthoDB, no PLAZA
# pdul/pdul/pdul
# wildcherry/pavi/pavi
# apricot/parm/parm



# not in OrthoDB or PLAZA
# pear/pcox/pcox
# cherryplum/pcer/pcer
# siberianapricot/psib/psib
params_list <- list(

  plantName1 = 'mdo'
  , # change name - PLAZA, OrthoDB, RBH
  plantName2 = 'malus_domestica_golden'
  , # change name - compara # sources
  plantName3 = 'mdo_HChap1'
  ,  # change name - MCScanX # sources
  plantName4 = 'mdo'
  ,  # change name - FastOMA # sources
  
  plantDirIn = "mdo_apple"
  ,# inconsistent-IDs, orthofinder
  plantNameOut = "apple"
  ,
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "apple")
  ,
  
  pattern_in = "\\.[^.]*$"
  , # everythin after the last dot
  pattern_out = ""
  , # all-IDs
  compara_pattern_in1 = '\\..*'
  ,
  compara_pattern_out1 = ""
  ,
  compara_pattern_in2 = ".*_"
  ,
  compara_pattern_out2 = ""
  ,
  plaza_pattern_in1 = '\\..*'
  ,
  plaza_pattern_in2 = ".* "
  ,
  
  
  ref_genome = "g.Honeycrisp_HAP1_braker1+2_combined_fullSupport_longname_filtered.pep"
  , # inconsistent-IDs
  
  mercator = 'mdo_Mercator4v7_results.txt'
  , # plant-gmm
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']"
  , # plant-gmm
  mercatorPatternOut1 = ""
  , # plant-gmm
  mercatorPatternIn2 = "a.g"
  , # plant-gmm
  mercatorPatternOut2 = "A.g"
  , # plant-gmm
  flag1 = 1
  ,
  flag2 = 1
  ,
  flag3 = FALSE

)

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000024eb2a4aaf0>

child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-10] | |….. | 9% | |…… | 12% [unnamed-chunk-11] | |…….. | 15% | |……… | 18% [unnamed-chunk-12] | |……….. | 21% | |………… | 24% [unnamed-chunk-13] | |………….. | 27% | |…………… | 30% [unnamed-chunk-14] | |…………….. | 33% | |………………. | 36% [unnamed-chunk-15] | |……………….. | 39% | |…………………. | 42% [unnamed-chunk-16] | |………………….. | 45% | |……………………. | 48% [unnamed-chunk-17] | |…………………….. | 52% | |………………………. | 55% [unnamed-chunk-18] | |……………………….. | 58% | |…………………………. | 61% [unnamed-chunk-19] | |………………………….. | 64% | |……………………………. | 67% [unnamed-chunk-20] | |……………………………… | 70% | |………………………………. | 73% [unnamed-chunk-21] | |………………………………… | 76% | |…………………………………. | 79% [unnamed-chunk-22] | |…………………………………… | 82% | |……………………………………. | 85% [unnamed-chunk-23] | |……………………………………… | 88% | |………………………………………. | 91% [unnamed-chunk-24] | |………………………………………… | 94% | |…………………………………………. | 97% [unnamed-chunk-25] | |……………………………………………| 100%

cat(child_content)

4 Subsection: mdo

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

4.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'ensembl-compara') {
    
    dt = dt[dt$homology_species == plantName2, ]
    # print(head(dt))
    dt = dt[, c(1,2,6,7,10)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    df = rbind(df, dt)
    
  } else if (us == 'FastOMA') {
    
    dt = dt[dt$to_plant == plantName4, ]
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 5)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 1] = NA
    dt[, 3] = NA
    df = rbind(df, dt)
    
  } else if (us == 'MCScanX') {
    
    # dt = dt[grepl('stu', dt$to_plant), ]
    dt = dt[grepl(plantName3, dt$to_plant), ] #  change names
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 6)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 1] = NA
    dt[, 3] = NA
    df = rbind(df, dt)
    
  } else if (us == 'PLAZA') {
    
    dt = dt[dt$orthologous_species == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  } else if (us == 'OrthoDB') {
    
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  } else if (us == 'RBH') {
    
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  }   else print ('ERROR: Unknown source')
}
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
table(df$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB           PLAZA 
##           22396           79898           60472           56069           33104 
##             RBH 
##           37682
df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_protID to_geneID                   to_protID          source
##    <chr>       <chr>       <chr>                       <chr>              <chr> 
##  1 <NA>        AT4G10330.1 <NA>                        Maldo.hc.v1a1.ch1… FastO…
##  2 <NA>        AT4G10330.1 <NA>                        Maldo.hc.v1a1.ch1… FastO…
##  3 <NA>        AT5G44410.1 <NA>                        g1.t1              FastO…
##  4 <NA>        AT5G44440.2 <NA>                        g1.t1              FastO…
##  5 <NA>        AT1G71250.1 <NA>                        Maldo.hc.v1a1.ch1… MCSca…
##  6 <NA>        AT1G71250.1 <NA>                        Maldo.hc.v1a1.ch1… MCSca…
##  7 <NA>        AT2G07655.1 <NA>                        Maldo.hc.v1a1.sc4… MCSca…
##  8 <NA>        AT2G07655.1 <NA>                        Maldo.hc.v1a1.sc4… MCSca…
##  9 AT2G46320   <NA>        Maldo.hc.v1a1.ch1A.g26266   <NA>               Ortho…
## 10 AT4G27940   <NA>        Maldo.hc.v1a1.ch1A.g26266   <NA>               Ortho…
## 11 AT2G07695   <NA>        Maldo.hc.v1a1.sc164A.g48922 <NA>               Ortho…
## 12 AT2G07695   <NA>        Maldo.hc.v1a1.sc119A.g48697 <NA>               Ortho…
## 13 AT1G01020   <NA>        MD01G1091900                <NA>               PLAZA 
## 14 AT1G01020   <NA>        MD07G1162500                <NA>               PLAZA 
## 15 AT1G16360   <NA>        MD17G1286800                <NA>               PLAZA 
## 16 AT1G79450   <NA>        MD17G1286800                <NA>               PLAZA 
## 17 AT1G01020   <NA>        Maldo.hc.v1a1.ch7A.g41990   <NA>               RBH   
## 18 AT1G01030   <NA>        Maldo.hc.v1a1.ch1A.g25187   <NA>               RBH   
## 19 ATMG01410   <NA>        Maldo.hc.v1a1.sc36A.g49760  <NA>               RBH   
## 20 ATMG01410   <NA>        Maldo.hc.v1a1.sc71A.g49908  <NA>               RBH   
## 21 AT1G01020   AT1G01020.1 MD01G0070900                mRNA:MD01G0070900  ensem…
## 22 AT1G01050   AT1G01050.1 MD07G0133100                mRNA:MD07G0133100  ensem…
## 23 AT5G67630   AT5G67630.1 MD02G0158200                mRNA:MD02G0158200  ensem…
## 24 AT5G67630   AT5G67630.1 MD15G0258100                mRNA:MD15G0258100  ensem…

4.2 Transcript (aka protein) to geneID

ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])

# orfs!
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)
## 
## MCScanX 
##      12
print(df[ind, ])
##         from_geneID     from_protID to_geneID                     to_protID
##              <char>          <char>    <char>                        <char>
##  1: AT1G25470.uORF1 AT1G25470.uORF1      <NA> Maldo.hc.v1a1.ch13A.g09400.t1
##  2: AT1G68550.uORF1 AT1G68550.uORF1      <NA> Maldo.hc.v1a1.ch13A.g09400.t1
##  3: AT4G30960.uORF1 AT4G30960.uORF1      <NA> Maldo.hc.v1a1.ch15A.g15777.t1
##  4: AT1G58120.uORF1 AT1G58120.uORF1      <NA> Maldo.hc.v1a1.ch15A.g18095.t1
##  5: AT1G68550.uORF1 AT1G68550.uORF1      <NA> Maldo.hc.v1a1.ch16A.g19057.t1
##  6: AT4G25670.uORF1 AT4G25670.uORF1      <NA>  Maldo.hc.v1a1.ch1A.g26199.t1
##  7: AT4G25690.uORF1 AT4G25690.uORF1      <NA>  Maldo.hc.v1a1.ch1A.g26199.t1
##  8: AT5G52550.uORF1 AT5G52550.uORF1      <NA>  Maldo.hc.v1a1.ch1A.g26199.t1
##  9: AT4G30960.uORF1 AT4G30960.uORF1      <NA>  Maldo.hc.v1a1.ch2A.g26596.t1
## 10: AT4G25670.uORF1 AT4G25670.uORF1      <NA>  Maldo.hc.v1a1.ch7A.g43057.t1
## 11: AT4G25690.uORF1 AT4G25690.uORF1      <NA>  Maldo.hc.v1a1.ch7A.g43057.t1
## 12: AT5G52550.uORF1 AT5G52550.uORF1      <NA>  Maldo.hc.v1a1.ch7A.g43057.t1
##      source
##      <char>
##  1: MCScanX
##  2: MCScanX
##  3: MCScanX
##  4: MCScanX
##  5: MCScanX
##  6: MCScanX
##  7: MCScanX
##  8: MCScanX
##  9: MCScanX
## 10: MCScanX
## 11: MCScanX
## 12: MCScanX
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed



df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_protID to_geneID                   to_protID          source
##    <chr>       <chr>       <chr>                       <chr>              <chr> 
##  1 AT4G10330   AT4G10330.1 Maldo.hc.v1a1.ch10A.g00003  Maldo.hc.v1a1.ch1… FastO…
##  2 AT4G10330   AT4G10330.1 Maldo.hc.v1a1.ch10A.g00003  Maldo.hc.v1a1.ch1… FastO…
##  3 AT5G44410   AT5G44410.1 g1                          g1.t1              FastO…
##  4 AT5G44440   AT5G44440.2 g1                          g1.t1              FastO…
##  5 AT1G71250   AT1G71250.1 Maldo.hc.v1a1.ch10A.g00019  Maldo.hc.v1a1.ch1… MCSca…
##  6 AT1G71250   AT1G71250.1 Maldo.hc.v1a1.ch10A.g00019  Maldo.hc.v1a1.ch1… MCSca…
##  7 AT2G07655   AT2G07655.1 Maldo.hc.v1a1.sc45A.g49893  Maldo.hc.v1a1.sc4… MCSca…
##  8 AT2G07655   AT2G07655.1 Maldo.hc.v1a1.sc45A.g49893  Maldo.hc.v1a1.sc4… MCSca…
##  9 AT2G46320   <NA>        Maldo.hc.v1a1.ch1A.g26266   <NA>               Ortho…
## 10 AT4G27940   <NA>        Maldo.hc.v1a1.ch1A.g26266   <NA>               Ortho…
## 11 AT2G07695   <NA>        Maldo.hc.v1a1.sc164A.g48922 <NA>               Ortho…
## 12 AT2G07695   <NA>        Maldo.hc.v1a1.sc119A.g48697 <NA>               Ortho…
## 13 AT1G01020   <NA>        MD01G1091900                <NA>               PLAZA 
## 14 AT1G01020   <NA>        MD07G1162500                <NA>               PLAZA 
## 15 AT1G16360   <NA>        MD17G1286800                <NA>               PLAZA 
## 16 AT1G79450   <NA>        MD17G1286800                <NA>               PLAZA 
## 17 AT1G01020   <NA>        Maldo.hc.v1a1.ch7A.g41990   <NA>               RBH   
## 18 AT1G01030   <NA>        Maldo.hc.v1a1.ch1A.g25187   <NA>               RBH   
## 19 ATMG01410   <NA>        Maldo.hc.v1a1.sc36A.g49760  <NA>               RBH   
## 20 ATMG01410   <NA>        Maldo.hc.v1a1.sc71A.g49908  <NA>               RBH   
## 21 AT1G01020   AT1G01020.1 MD01G0070900                mRNA:MD01G0070900  ensem…
## 22 AT1G01050   AT1G01050.1 MD07G0133100                mRNA:MD07G0133100  ensem…
## 23 AT5G67630   AT5G67630.1 MD02G0158200                mRNA:MD02G0158200  ensem…
## 24 AT5G67630   AT5G67630.1 MD15G0258100                mRNA:MD15G0258100  ensem…
summary_na = df[, .(
  na_to_geneID = sum(is.na(to_geneID)),
  na_to_protID = sum(is.na(to_protID))
), by = source]
print(summary_na)
##             source na_to_geneID na_to_protID
##             <char>        <int>        <int>
## 1: ensembl-compara            0            0
## 2:         FastOMA            0            0
## 3:         MCScanX            0            0
## 4:         OrthoDB            0        56069
## 5:           PLAZA            0        33104
## 6:             RBH            0        37682

4.3 PLAZA and ensembl-compara with Orthofinder

here we have some loses because genes between versions do not translate well!

fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
fl = list.files(fp)
fn = fl[grep('Compara_', fl)] # change filename
compara = data.table::fread(file.path(fp, fn))
fn = fl[grep('PLAZA_', fl)] # change filename
if (length(fn) != 0) {
  plaza = data.table::fread(file.path(fp, fn))
} else {
  plaza = data.frame(matrix(ncol = 4, nrow = 0))
}


compara = compara[compara$Species == ref_genome, ] # change name
plaza = plaza[plaza$Species == ref_genome, ] # change name


colnames(compara)[3] = colnames(plaza)[3] = 'source'


compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = compara[, {
  # Cartesian join of OrthoDB_list and Orthologs_list for this row
  pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
  setnames(pairs, c("OrthoDB_ID", "Ortholog"))
  pairs
}, by = seq_len(nrow(compara))]
compara = result[, seq_len := NULL]
# compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1, 
                         sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
compara = compara[!duplicated(compara), ]
head(compara)
##      OrthoDB_ID                      Ortholog
##          <char>                        <char>
## 1: MD07G0132900 Maldo.hc.v1a1.ch16A.g19172.t1
## 2: MD07G0132900 Maldo.hc.v1a1.ch11A.g06055.t1
## 3: MD07G0132900 Maldo.hc.v1a1.ch11A.g06055.t2
## 4: MD07G0132900  Maldo.hc.v1a1.ch8A.g43704.t1
## 5: MD07G0132900 Maldo.hc.v1a1.ch14A.g12608.t1
## 6: MD07G0132900  Maldo.hc.v1a1.ch4A.g33125.t1
if (nrow(plaza) != 0) {
  plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
  plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
  result = plaza[, {
    # Cartesian join of OrthoDB_list and Orthologs_list for this row
    pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
    setnames(pairs, c("OrthoDB_ID", "Ortholog"))
    pairs
  }, by = seq_len(nrow(plaza))]
  plaza = result[, seq_len := NULL]
  # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
  plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
  plaza = plaza[!duplicated(plaza), ]
  head(plaza)  
}
##      OrthoDB_ID                      Ortholog
##          <char>                        <char>
## 1: MD15G1256000 Maldo.hc.v1a1.ch16A.g19172.t1
## 2: MD15G1256000 Maldo.hc.v1a1.ch11A.g06055.t1
## 3: MD15G1256000 Maldo.hc.v1a1.ch11A.g06055.t2
## 4: MD15G1256000  Maldo.hc.v1a1.ch8A.g43704.t1
## 5: MD15G1256000 Maldo.hc.v1a1.ch14A.g12608.t1
## 6: MD15G1256000  Maldo.hc.v1a1.ch4A.g33125.t1
if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog)

if (flag2 == 1) { # geneID and prot ID are completely different # make flags
  df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
    dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog)  
} else {
    df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
    dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog) 
}

df_compara = df_compara[!is.na(df_compara$to_geneID), ]


if (nrow(plaza) != 0) {
  df_plaza = dplyr::filter(df, source == "PLAZA") %>%
    dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog)
  df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
}

if (nrow(plaza) != 0) {
  df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))  
  dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
} else {
  df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
  dt = dplyr::bind_rows(df_compara, df_other)
}

ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
df = dt[, ..ind]
df = df[!duplicated(df), ]


if (nrow(plaza) != 0) {
  ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
  df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
} else {
  ind = which(df$source %in% c('ensembl-compara'))
  df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
}





df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 3
##    from_geneID to_geneID                   source         
##    <chr>       <chr>                       <chr>          
##  1 AT4G10330   Maldo.hc.v1a1.ch10A.g00003  FastOMA        
##  2 AT4G10340   Maldo.hc.v1a1.ch10A.g00004  FastOMA        
##  3 AT5G44410   g1                          FastOMA        
##  4 AT5G44440   g1                          FastOMA        
##  5 AT1G71250   Maldo.hc.v1a1.ch10A.g00019  MCScanX        
##  6 AT4G10170   Maldo.hc.v1a1.ch10A.g00024  MCScanX        
##  7 ATMG01320   Maldo.hc.v1a1.sc45A.g49892  MCScanX        
##  8 AT2G07655   Maldo.hc.v1a1.sc45A.g49893  MCScanX        
##  9 AT2G46320   Maldo.hc.v1a1.ch1A.g26266   OrthoDB        
## 10 AT4G27940   Maldo.hc.v1a1.ch1A.g26266   OrthoDB        
## 11 AT2G07695   Maldo.hc.v1a1.sc164A.g48922 OrthoDB        
## 12 AT2G07695   Maldo.hc.v1a1.sc119A.g48697 OrthoDB        
## 13 AT1G01020   Maldo.hc.v1a1.ch1A.g25188   PLAZA          
## 14 AT1G01020   Maldo.hc.v1a1.ch7A.g41989   PLAZA          
## 15 AT1G16360   Maldo.hc.v1a1.ch17A.g24061  PLAZA          
## 16 AT1G79450   Maldo.hc.v1a1.ch17A.g24061  PLAZA          
## 17 AT1G01020   Maldo.hc.v1a1.ch7A.g41990   RBH            
## 18 AT1G01030   Maldo.hc.v1a1.ch1A.g25187   RBH            
## 19 ATMG01410   Maldo.hc.v1a1.sc36A.g49760  RBH            
## 20 ATMG01410   Maldo.hc.v1a1.sc71A.g49908  RBH            
## 21 AT1G01020   Maldo.hc.v1a1.ch1A.g25188   ensembl-compara
## 22 AT1G01050   Maldo.hc.v1a1.ch1A.g25184   ensembl-compara
## 23 AT5G67630   Maldo.hc.v1a1.ch2A.g27934   ensembl-compara
## 24 AT5G67630   Maldo.hc.v1a1.ch15A.g17150  ensembl-compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))




gc()
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1091349 58.3    2079652 111.1  2079652 111.1
## Vcells 4849864 37.1   16015626 122.2 16007166 122.2
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

4.4 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23053
length(unique(dt$to_geneID))
## [1] 34410
table(dt$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB           PLAZA 
##           20997           77128           32045           56069           30068 
##             RBH 
##           37682
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

4.5 Upset plot

if (flag1 == 1) {
  source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {
  source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

4.6 Ath ORFs

  • take care, ath cds (for MCScanX) fasta contains for e.g. besides AT1G30330.1, AT1G30330.2, AT1G30330.3
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
dt.wide[grep('ORF', dt.wide$from_geneID), ]
## Key: <from_geneID, to_geneID>
##         from_geneID                  to_geneID FastOMA MCScanX OrthoDB  PLAZA
##              <char>                     <char>  <lgcl>  <lgcl>  <lgcl> <lgcl>
##  1: AT1G25470.uORF1 Maldo.hc.v1a1.ch13A.g09400   FALSE    TRUE   FALSE  FALSE
##  2: AT1G58120.uORF1 Maldo.hc.v1a1.ch15A.g18095   FALSE    TRUE   FALSE  FALSE
##  3: AT1G68550.uORF1 Maldo.hc.v1a1.ch13A.g09400   FALSE    TRUE   FALSE  FALSE
##  4: AT1G68550.uORF1 Maldo.hc.v1a1.ch16A.g19057   FALSE    TRUE   FALSE  FALSE
##  5: AT4G25670.uORF1  Maldo.hc.v1a1.ch1A.g26199   FALSE    TRUE   FALSE  FALSE
##  6: AT4G25670.uORF1  Maldo.hc.v1a1.ch7A.g43057   FALSE    TRUE   FALSE  FALSE
##  7: AT4G25690.uORF1  Maldo.hc.v1a1.ch1A.g26199   FALSE    TRUE   FALSE  FALSE
##  8: AT4G25690.uORF1  Maldo.hc.v1a1.ch7A.g43057   FALSE    TRUE   FALSE  FALSE
##  9: AT4G30960.uORF1 Maldo.hc.v1a1.ch15A.g15777   FALSE    TRUE   FALSE  FALSE
## 10: AT4G30960.uORF1  Maldo.hc.v1a1.ch2A.g26596   FALSE    TRUE   FALSE  FALSE
## 11: AT5G52550.uORF1  Maldo.hc.v1a1.ch1A.g26199   FALSE    TRUE   FALSE  FALSE
## 12: AT5G52550.uORF1  Maldo.hc.v1a1.ch7A.g43057   FALSE    TRUE   FALSE  FALSE
##        RBH ensembl-compara count_evidence
##     <lgcl>          <lgcl>          <num>
##  1:  FALSE           FALSE              1
##  2:  FALSE           FALSE              1
##  3:  FALSE           FALSE              1
##  4:  FALSE           FALSE              1
##  5:  FALSE           FALSE              1
##  6:  FALSE           FALSE              1
##  7:  FALSE           FALSE              1
##  8:  FALSE           FALSE              1
##  9:  FALSE           FALSE              1
## 10:  FALSE           FALSE              1
## 11:  FALSE           FALSE              1
## 12:  FALSE           FALSE              1
dt.wide = dt.wide[grep('ORF', dt.wide$from_geneID, invert = TRUE), ]

4.7 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

4.8 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

4.9 fruitTrees plant gmm

fp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]

combined = gmm[, .(
  BINCODE = paste(unique(BINCODE), collapse = " | "),
  NAME = paste(unique(NAME), collapse = " | "),
  DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]

charToRaw(combined$IDENTIFIER[1])
##  [1] 27 6d 61 6c 64 6f 2e 68 63 2e 76 31 61 31 2e 63 68 33 61 2e 67 33 31 32 39
## [26] 31 2e 74 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
charToRaw(combined$IDENTIFIER[1])
##  [1] 6d 61 6c 64 6f 2e 68 63 2e 76 31 61 31 2e 63 68 33 61 2e 67 33 31 32 39 31
## [26] 2e 74 31
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2))  # change as needed
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
## 14272 35833
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "ensembl-compara"
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "athName"         "athSynonims"    
## [17] "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
##  FALSE   TRUE 
## 122701     27
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
##  [1] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
## [16] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))


gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   6626957 354.0   11756942  627.9  11756942  627.9
## Vcells 112261222 856.5  190630912 1454.4 158751041 1211.2
library(magrittr)
library(ggplot2)
library(ComplexUpset)

4.10 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

4.11 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 23046
length(unique(df$to_geneID))
## [1] 34408
range(df$from_count)
## [1]   1 128
range(df$to_count)
## [1]   1 116
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 319
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 288
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag1 == 1) {
  methods = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  methods = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          95834          16984           4836
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       47777    14508           4314
##   2       13486     1598            273
##   3        8903      517             98
##   4        8949      203             63
##   5       10121      123             56
##   6        6598       35             32
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")



# Initialize an empty set or list covered_genes.
# 
# For each method in the list:
# ["MCScanX", "ensembl-compara", "PLAZA", "OrthoDB", "RBH", "FastOMA"]
# 
# For every row in the data table dt:
# 
# a. Check if filter_criteria is "reject" for the row.
# 
# b. Check if the value of the column corresponding to method in this row is TRUE.
# 
# c. Check if from_geneID in this row is not in covered_genes.
# 
# d. If method is one of ["OrthoDB", "RBH", "FastOMA"]:
      # Check if gene pair is covered by all three methods:
      # If yes:
      # is_candidate = TRUE
      # new_criteria = "OrthoDB_FastOMA_RBH"
      # Else if gene pair is covered by any two of those three methods:
      # If yes:
      # is_candidate = TRUE
      # new_criteria = join the two method names with underscore, e.g. "OrthoDB_FastOMA" or "RBH_FastOMA" or "OrthoDB_RBH"
      # Else if MapMan4_Match string contains "match based on" and current method name:
      # is_candidate = TRUE
      # new_criteria = {method}_MapMan4 (e.g., "RBH_MapMan4")
      # Else:
      # is_candidate = FALSE


special_methods = c("OrthoDB", "RBH", "FastOMA")

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {
  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            5183            1882            4015
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                5281                4015                1242                1072 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               32088                 579                5183                 670 
##               PLAZA         RBH_MapMan4              reject 
##                6577                1882               59065
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

4.12 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          53789           3695           1105
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       14991     1733            738
##   2        7682     1198            145
##   3        6646      416             80
##   4        8004      191             59
##   5        9868      122             51
##   6        6598       35             32
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag1 == 1) {
  source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}





# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 21121
length(unique(kept$to_geneID))
## [1] 33249
range(kept$from_count)
## [1]  1 93
range(kept$to_count)
## [1]  1 96
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 32
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 28
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

4.13 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 163                  83                  71                  35 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                1337                  26                 122                  30 
##               PLAZA         RBH_MapMan4              reject 
##                 264                  29                1788
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName1 = 'ppe', # change name - PLAZA, OrthoDB, RBH
  plantName2 = 'prunus_persica', # change name - compara # sources
  plantName3 = 'ppe',  # change name - MCScanX # sources
  plantName4 = 'pper',  # change name - FastOMA # sources
  
  plantDirIn = "ppe_peach", # inconsistent-IDs, orthofinder
  plantNameOut = "peach",
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "peach"),

  pattern_in = "\\.[^.]*$", # everythin after the last dot
  pattern_out = "", # all-IDs
  compara_pattern_in1 = "",
  compara_pattern_out1 = "",
  compara_pattern_in2 = "",
  compara_pattern_out2 = "",
  plaza_pattern_in1 = "",
  plaza_pattern_in2 = "",
  
  ref_genome = "PLAZA_proteome.selected_transcript.ppe", # inconsistent-IDs, orthofinder for OrthoDB
  
  mercator = 'pper_Mercator4v7_results.txt', # plant-gmm
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']", # plant-gmm, generic removal of nonsence
  mercatorPatternOut1 = "", # plant-gmm
  mercatorPatternIn2 = "g", # plant-gmm
  mercatorPatternOut2 = "G", # plant-gmm
  flag1 = 1,
  flag2 = 2,
  flag3 = FALSE
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000024f4761edd0>

child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-44] | |….. | 9% | |…… | 12% [unnamed-chunk-45] | |…….. | 15% | |……… | 18% [unnamed-chunk-46] | |……….. | 21% | |………… | 24% [unnamed-chunk-47] | |………….. | 27% | |…………… | 30% [unnamed-chunk-48] | |…………….. | 33% | |………………. | 36% [unnamed-chunk-49] | |……………….. | 39% | |…………………. | 42% [unnamed-chunk-50] | |………………….. | 45% | |……………………. | 48% [unnamed-chunk-51] | |…………………….. | 52% | |………………………. | 55% [unnamed-chunk-52] | |……………………….. | 58% | |…………………………. | 61% [unnamed-chunk-53] | |………………………….. | 64% | |……………………………. | 67% [unnamed-chunk-54] | |……………………………… | 70% | |………………………………. | 73% [unnamed-chunk-55] | |………………………………… | 76% | |…………………………………. | 79% [unnamed-chunk-56] | |…………………………………… | 82% | |……………………………………. | 85% [unnamed-chunk-57] | |……………………………………… | 88% | |………………………………………. | 91% [unnamed-chunk-58] | |………………………………………… | 94% | |…………………………………………. | 97% [unnamed-chunk-59] | |……………………………………………| 100%

cat(child_content)

5 Subsection: ppe

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

5.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'ensembl-compara') {
    
    dt = dt[dt$homology_species == plantName2, ]
    # print(head(dt))
    dt = dt[, c(1,2,6,7,10)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    df = rbind(df, dt)
    
  } else if (us == 'FastOMA') {
    
    dt = dt[dt$to_plant == plantName4, ]
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 5)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 1] = NA
    dt[, 3] = NA
    df = rbind(df, dt)
    
  } else if (us == 'MCScanX') {
    
    # dt = dt[grepl('stu', dt$to_plant), ]
    dt = dt[grepl(plantName3, dt$to_plant), ] #  change names
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 6)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 1] = NA
    dt[, 3] = NA
    df = rbind(df, dt)
    
  } else if (us == 'PLAZA') {
    
    dt = dt[dt$orthologous_species == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  } else if (us == 'OrthoDB') {
    
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  } else if (us == 'RBH') {
    
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  }   else print ('ERROR: Unknown source')
}
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
table(df$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB           PLAZA 
##           16851           44006           62733           38370           20774 
##             RBH 
##           24564
df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_protID to_geneID      to_protID        source         
##    <chr>       <chr>       <chr>          <chr>            <chr>          
##  1 <NA>        AT1G12040.1 <NA>           Prupe.1G000500.1 FastOMA        
##  2 <NA>        AT1G62440.1 <NA>           Prupe.1G000500.1 FastOMA        
##  3 <NA>        AT1G61010.3 <NA>           Prupe.I006100.1  FastOMA        
##  4 <NA>        AT1G61010.3 <NA>           Prupe.I006200.1  FastOMA        
##  5 <NA>        AT5G58130.1 <NA>           Prupe.1G000700.1 MCScanX        
##  6 <NA>        AT5G58110.1 <NA>           Prupe.1G000800.1 MCScanX        
##  7 <NA>        AT4G02060.1 <NA>           Prupe.8G272000.1 MCScanX        
##  8 <NA>        AT4G02060.2 <NA>           Prupe.8G272000.1 MCScanX        
##  9 AT3G17900   <NA>        Prupe.1G267800 <NA>             OrthoDB        
## 10 AT4G35230   <NA>        Prupe.1G355500 <NA>             OrthoDB        
## 11 AT2G07675   <NA>        Prupe.6G146800 <NA>             OrthoDB        
## 12 ATMG00980   <NA>        Prupe.6G146800 <NA>             OrthoDB        
## 13 AT1G01020   <NA>        Prupe.2G201100 <NA>             PLAZA          
## 14 AT1G01050   <NA>        Prupe.2G200700 <NA>             PLAZA          
## 15 AT1G52360   <NA>        Prupe.I003200  <NA>             PLAZA          
## 16 AT5G40850   <NA>        Prupe.I005100  <NA>             PLAZA          
## 17 AT1G01030   <NA>        Prupe.5G134900 <NA>             RBH            
## 18 AT1G01040   <NA>        Prupe.2G200900 <NA>             RBH            
## 19 ATMG01250   <NA>        Prupe.6G123900 <NA>             RBH            
## 20 ATMG01250   <NA>        Prupe.7G164000 <NA>             RBH            
## 21 AT1G01020   AT1G01020.1 PRUPE_2G201100 ONI23664         ensembl-compara
## 22 AT1G01040   AT1G01040.2 PRUPE_2G200900 ONI23660         ensembl-compara
## 23 AT5G67620   AT5G67620.1 PRUPE_6G219300 ONI02738         ensembl-compara
## 24 AT5G67630   AT5G67630.1 PRUPE_1G544700 ONI35595         ensembl-compara

5.2 Transcript (aka protein) to geneID

ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])

# orfs!
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)
## 
## MCScanX 
##      23
print(df[ind, ])
##         from_geneID     from_protID to_geneID        to_protID  source
##              <char>          <char>    <char>           <char>  <char>
##  1: AT3G25570.uORF1 AT3G25570.uORF1      <NA> Prupe.1G299600.1 MCScanX
##  2: AT1G25470.uORF1 AT1G25470.uORF1      <NA> Prupe.1G310000.1 MCScanX
##  3: AT1G68550.uORF1 AT1G68550.uORF1      <NA> Prupe.1G310000.1 MCScanX
##  4: AT1G23150.uORF1 AT1G23150.uORF1      <NA> Prupe.1G329400.1 MCScanX
##  5: AT1G70780.uORF1 AT1G70780.uORF1      <NA> Prupe.1G329400.1 MCScanX
##  6: AT1G75390.uORF1 AT1G75390.uORF1      <NA> Prupe.1G374500.1 MCScanX
##  7: AT5G50010.uORF2 AT5G50010.uORF2      <NA> Prupe.1G527700.1 MCScanX
##  8: AT4G25670.uORF1 AT4G25670.uORF1      <NA> Prupe.2G300500.1 MCScanX
##  9: AT4G25690.uORF1 AT4G25690.uORF1      <NA> Prupe.2G300500.1 MCScanX
## 10: AT5G52550.uORF1 AT5G52550.uORF1      <NA> Prupe.2G300500.1 MCScanX
## 11: AT5G53590.uORF1 AT5G53590.uORF1      <NA> Prupe.2G317000.1 MCScanX
## 12: AT3G02470.uORF1 AT3G02470.uORF1      <NA> Prupe.3G243800.1 MCScanX
## 13: AT5G15950.uORF1 AT5G15950.uORF1      <NA> Prupe.3G243800.1 MCScanX
## 14: AT1G29950.uORF2 AT1G29950.uORF2      <NA> Prupe.4G077000.1 MCScanX
## 15: AT4G19110.uORF1 AT4G19110.uORF1      <NA> Prupe.5G021200.1 MCScanX
## 16: AT5G45430.uORF1 AT5G45430.uORF1      <NA> Prupe.5G021200.1 MCScanX
## 17: AT2G27230.uORF1 AT2G27230.uORF1      <NA> Prupe.6G144400.1 MCScanX
## 18: AT3G12010.uORF1 AT3G12010.uORF1      <NA> Prupe.7G082600.1 MCScanX
## 19: AT4G36990.uORF1 AT4G36990.uORF1      <NA> Prupe.7G133500.1 MCScanX
## 20: AT2G18160.uORF1 AT2G18160.uORF1      <NA> Prupe.7G160500.1 MCScanX
## 21: AT5G09460.uORF1 AT5G09460.uORF1      <NA> Prupe.8G067700.1 MCScanX
## 22: AT5G64340.uORF1 AT5G64340.uORF1      <NA> Prupe.8G067700.1 MCScanX
## 23: AT4G34590.uORF1 AT4G34590.uORF1      <NA> Prupe.8G091700.1 MCScanX
##         from_geneID     from_protID to_geneID        to_protID  source
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed



df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_protID to_geneID      to_protID        source         
##    <chr>       <chr>       <chr>          <chr>            <chr>          
##  1 AT1G12040   AT1G12040.1 Prupe.1G000500 Prupe.1G000500.1 FastOMA        
##  2 AT1G62440   AT1G62440.1 Prupe.1G000500 Prupe.1G000500.1 FastOMA        
##  3 AT1G61010   AT1G61010.3 Prupe.I006100  Prupe.I006100.1  FastOMA        
##  4 AT1G61010   AT1G61010.3 Prupe.I006200  Prupe.I006200.1  FastOMA        
##  5 AT5G58130   AT5G58130.1 Prupe.1G000700 Prupe.1G000700.1 MCScanX        
##  6 AT5G58110   AT5G58110.1 Prupe.1G000800 Prupe.1G000800.1 MCScanX        
##  7 AT4G02060   AT4G02060.1 Prupe.8G272000 Prupe.8G272000.1 MCScanX        
##  8 AT4G02060   AT4G02060.2 Prupe.8G272000 Prupe.8G272000.1 MCScanX        
##  9 AT3G17900   <NA>        Prupe.1G267800 <NA>             OrthoDB        
## 10 AT4G35230   <NA>        Prupe.1G355500 <NA>             OrthoDB        
## 11 AT2G07675   <NA>        Prupe.6G146800 <NA>             OrthoDB        
## 12 ATMG00980   <NA>        Prupe.6G146800 <NA>             OrthoDB        
## 13 AT1G01020   <NA>        Prupe.2G201100 <NA>             PLAZA          
## 14 AT1G01050   <NA>        Prupe.2G200700 <NA>             PLAZA          
## 15 AT1G52360   <NA>        Prupe.I003200  <NA>             PLAZA          
## 16 AT5G40850   <NA>        Prupe.I005100  <NA>             PLAZA          
## 17 AT1G01030   <NA>        Prupe.5G134900 <NA>             RBH            
## 18 AT1G01040   <NA>        Prupe.2G200900 <NA>             RBH            
## 19 ATMG01250   <NA>        Prupe.6G123900 <NA>             RBH            
## 20 ATMG01250   <NA>        Prupe.7G164000 <NA>             RBH            
## 21 AT1G01020   AT1G01020.1 PRUPE_2G201100 ONI23664         ensembl-compara
## 22 AT1G01040   AT1G01040.2 PRUPE_2G200900 ONI23660         ensembl-compara
## 23 AT5G67620   AT5G67620.1 PRUPE_6G219300 ONI02738         ensembl-compara
## 24 AT5G67630   AT5G67630.1 PRUPE_1G544700 ONI35595         ensembl-compara
summary_na = df[, .(
  na_to_geneID = sum(is.na(to_geneID)),
  na_to_protID = sum(is.na(to_protID))
), by = source]
print(summary_na)
##             source na_to_geneID na_to_protID
##             <char>        <int>        <int>
## 1: ensembl-compara            0            0
## 2:         FastOMA            0            0
## 3:         MCScanX            0            0
## 4:         OrthoDB            0        38370
## 5:           PLAZA            0        20774
## 6:             RBH            0        24564

5.3 PLAZA and ensembl-compara with Orthofinder

here we have some loses because genes between versions do not translate well!

fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
fl = list.files(fp)
fn = fl[grep('Compara_', fl)] # change filename
compara = data.table::fread(file.path(fp, fn))
fn = fl[grep('PLAZA_', fl)] # change filename
if (length(fn) != 0) {
  plaza = data.table::fread(file.path(fp, fn))
} else {
  plaza = data.frame(matrix(ncol = 4, nrow = 0))
}


compara = compara[compara$Species == ref_genome, ] # change name
plaza = plaza[plaza$Species == ref_genome, ] # change name


colnames(compara)[3] = colnames(plaza)[3] = 'source'


compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = compara[, {
  # Cartesian join of OrthoDB_list and Orthologs_list for this row
  pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
  setnames(pairs, c("OrthoDB_ID", "Ortholog"))
  pairs
}, by = seq_len(nrow(compara))]
compara = result[, seq_len := NULL]
# compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1, 
                         sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
compara = compara[!duplicated(compara), ]
head(compara)
##    OrthoDB_ID         Ortholog
##        <char>           <char>
## 1:   ONH90473 Prupe.8G055800.1
## 2:   ONI29575 Prupe.1G202700.1
## 3:   ONH90342 Prupe.8G047800.1
## 4:   ONH90445 Prupe.8G054500.1
## 5:   ONI26493 Prupe.1G028600.3
## 6:   ONI21208 Prupe.2G053400.1
if (nrow(plaza) != 0) {
  plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
  plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
  result = plaza[, {
    # Cartesian join of OrthoDB_list and Orthologs_list for this row
    pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
    setnames(pairs, c("OrthoDB_ID", "Ortholog"))
    pairs
  }, by = seq_len(nrow(plaza))]
  plaza = result[, seq_len := NULL]
  # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
  plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
  plaza = plaza[!duplicated(plaza), ]
  head(plaza)  
}

if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog)

if (flag2 == 1) { # geneID and prot ID are completely different # make flags
  df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
    dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog)  
} else {
    df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
    dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog) 
}

df_compara = df_compara[!is.na(df_compara$to_geneID), ]


if (nrow(plaza) != 0) {
  df_plaza = dplyr::filter(df, source == "PLAZA") %>%
    dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog)
  df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
}

if (nrow(plaza) != 0) {
  df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))  
  dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
} else {
  df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
  dt = dplyr::bind_rows(df_compara, df_other)
}

ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
df = dt[, ..ind]
df = df[!duplicated(df), ]


if (nrow(plaza) != 0) {
  ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
  df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
} else {
  ind = which(df$source %in% c('ensembl-compara'))
  df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
}





df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 3
##    from_geneID to_geneID      source         
##    <chr>       <chr>          <chr>          
##  1 AT1G12040   Prupe.1G000500 FastOMA        
##  2 AT1G62440   Prupe.1G000500 FastOMA        
##  3 AT1G61010   Prupe.I006100  FastOMA        
##  4 AT1G61010   Prupe.I006200  FastOMA        
##  5 AT5G58130   Prupe.1G000700 MCScanX        
##  6 AT5G58110   Prupe.1G000800 MCScanX        
##  7 AT3G62540   Prupe.8G271400 MCScanX        
##  8 AT4G02060   Prupe.8G272000 MCScanX        
##  9 AT3G17900   Prupe.1G267800 OrthoDB        
## 10 AT4G35230   Prupe.1G355500 OrthoDB        
## 11 AT2G07675   Prupe.6G146800 OrthoDB        
## 12 ATMG00980   Prupe.6G146800 OrthoDB        
## 13 AT1G01020   Prupe.2G201100 PLAZA          
## 14 AT1G01050   Prupe.2G200700 PLAZA          
## 15 AT1G52360   Prupe.I003200  PLAZA          
## 16 AT5G40850   Prupe.I005100  PLAZA          
## 17 AT1G01030   Prupe.5G134900 RBH            
## 18 AT1G01040   Prupe.2G200900 RBH            
## 19 ATMG01250   Prupe.6G123900 RBH            
## 20 ATMG01250   Prupe.7G164000 RBH            
## 21 AT1G01020   Prupe.2G201100 ensembl-compara
## 22 AT1G01040   Prupe.2G200900 ensembl-compara
## 23 AT5G67620   Prupe.6G219300 ensembl-compara
## 24 AT5G67630   Prupe.1G544700 ensembl-compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))




gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4673117 249.6    9405554  502.4  11756942  627.9
## Vcells 120380685 918.5  190630912 1454.4 190628107 1454.4
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

5.4 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23136
length(unique(dt$to_geneID))
## [1] 21243
table(dt$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB           PLAZA 
##           16193           44006           17894           38370           20774 
##             RBH 
##           24564
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

5.5 Upset plot

if (flag1 == 1) {
  source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {
  source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

5.6 Ath ORFs

  • take care, ath cds (for MCScanX) fasta contains for e.g. besides AT1G30330.1, AT1G30330.2, AT1G30330.3
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
dt.wide[grep('ORF', dt.wide$from_geneID), ]
## Key: <from_geneID, to_geneID>
##         from_geneID      to_geneID FastOMA MCScanX OrthoDB  PLAZA    RBH
##              <char>         <char>  <lgcl>  <lgcl>  <lgcl> <lgcl> <lgcl>
##  1: AT1G23150.uORF1 Prupe.1G329400   FALSE    TRUE   FALSE  FALSE  FALSE
##  2: AT1G25470.uORF1 Prupe.1G310000   FALSE    TRUE   FALSE  FALSE  FALSE
##  3: AT1G29950.uORF2 Prupe.4G077000   FALSE    TRUE   FALSE  FALSE  FALSE
##  4: AT1G68550.uORF1 Prupe.1G310000   FALSE    TRUE   FALSE  FALSE  FALSE
##  5: AT1G70780.uORF1 Prupe.1G329400   FALSE    TRUE   FALSE  FALSE  FALSE
##  6: AT1G75390.uORF1 Prupe.1G374500   FALSE    TRUE   FALSE  FALSE  FALSE
##  7: AT2G18160.uORF1 Prupe.7G160500   FALSE    TRUE   FALSE  FALSE  FALSE
##  8: AT2G27230.uORF1 Prupe.6G144400   FALSE    TRUE   FALSE  FALSE  FALSE
##  9: AT3G02470.uORF1 Prupe.3G243800   FALSE    TRUE   FALSE  FALSE  FALSE
## 10: AT3G12010.uORF1 Prupe.7G082600   FALSE    TRUE   FALSE  FALSE  FALSE
## 11: AT3G25570.uORF1 Prupe.1G299600   FALSE    TRUE   FALSE  FALSE  FALSE
## 12: AT4G19110.uORF1 Prupe.5G021200   FALSE    TRUE   FALSE  FALSE  FALSE
## 13: AT4G25670.uORF1 Prupe.2G300500   FALSE    TRUE   FALSE  FALSE  FALSE
## 14: AT4G25690.uORF1 Prupe.2G300500   FALSE    TRUE   FALSE  FALSE  FALSE
## 15: AT4G34590.uORF1 Prupe.8G091700   FALSE    TRUE   FALSE  FALSE  FALSE
## 16: AT4G36990.uORF1 Prupe.7G133500   FALSE    TRUE   FALSE  FALSE  FALSE
## 17: AT5G09460.uORF1 Prupe.8G067700   FALSE    TRUE   FALSE  FALSE  FALSE
## 18: AT5G15950.uORF1 Prupe.3G243800   FALSE    TRUE   FALSE  FALSE  FALSE
## 19: AT5G45430.uORF1 Prupe.5G021200   FALSE    TRUE   FALSE  FALSE  FALSE
## 20: AT5G50010.uORF2 Prupe.1G527700   FALSE    TRUE   FALSE  FALSE  FALSE
## 21: AT5G52550.uORF1 Prupe.2G300500   FALSE    TRUE   FALSE  FALSE  FALSE
## 22: AT5G53590.uORF1 Prupe.2G317000   FALSE    TRUE   FALSE  FALSE  FALSE
## 23: AT5G64340.uORF1 Prupe.8G067700   FALSE    TRUE   FALSE  FALSE  FALSE
##         from_geneID      to_geneID FastOMA MCScanX OrthoDB  PLAZA    RBH
##     ensembl-compara count_evidence
##              <lgcl>          <num>
##  1:           FALSE              1
##  2:           FALSE              1
##  3:           FALSE              1
##  4:           FALSE              1
##  5:           FALSE              1
##  6:           FALSE              1
##  7:           FALSE              1
##  8:           FALSE              1
##  9:           FALSE              1
## 10:           FALSE              1
## 11:           FALSE              1
## 12:           FALSE              1
## 13:           FALSE              1
## 14:           FALSE              1
## 15:           FALSE              1
## 16:           FALSE              1
## 17:           FALSE              1
## 18:           FALSE              1
## 19:           FALSE              1
## 20:           FALSE              1
## 21:           FALSE              1
## 22:           FALSE              1
## 23:           FALSE              1
##     ensembl-compara count_evidence
dt.wide = dt.wide[grep('ORF', dt.wide$from_geneID, invert = TRUE), ]

5.7 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

5.8 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

5.9 fruitTrees plant gmm

fp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]

combined = gmm[, .(
  BINCODE = paste(unique(BINCODE), collapse = " | "),
  NAME = paste(unique(NAME), collapse = " | "),
  DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]

charToRaw(combined$IDENTIFIER[1])
##  [1] 27 70 72 75 70 65 2e 33 67 30 30 34 31 30 30 2e 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
charToRaw(combined$IDENTIFIER[1])
##  [1] 70 72 75 70 65 2e 33 67 30 30 34 31 30 30 2e 31
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2))  # change as needed
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
##  5670 21203
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "ensembl-compara"
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "athName"         "athSynonims"    
## [17] "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE  TRUE 
## 71187   231
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
##   [1] "Prupe.I000100" "Prupe.I000200" "Prupe.I000200" "Prupe.I000200"
##   [5] "Prupe.I000200" "Prupe.I000200" "Prupe.I000200" "Prupe.I000300"
##   [9] "Prupe.I000300" "Prupe.I000300" "Prupe.I000400" "Prupe.I000400"
##  [13] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [17] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [21] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [25] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [29] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [33] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [37] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [41] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [45] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [49] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000600"
##  [53] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [57] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [61] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [65] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [69] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [73] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [77] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [81] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [85] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [89] "Prupe.I000600" "Prupe.I000600" "Prupe.I000700" "Prupe.I000700"
##  [93] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
##  [97] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [101] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [105] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [109] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [113] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [117] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [121] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [125] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [129] "Prupe.I000700" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [133] "Prupe.I000800" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [137] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I000900"
## [141] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I001000"
## [145] "Prupe.I001000" "Prupe.I001000" "Prupe.I001000" "Prupe.I001000"
## [149] "Prupe.I001000" "Prupe.I001000" "Prupe.I001100" "Prupe.I001100"
## [153] "Prupe.I001100" "Prupe.I001600" "Prupe.I001700" "Prupe.I001700"
## [157] "Prupe.I001700" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [161] "Prupe.I001800" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [165] "Prupe.I001800" "Prupe.I001800" "Prupe.I001900" "Prupe.I001900"
## [169] "Prupe.I002100" "Prupe.I002100" "Prupe.I002300" "Prupe.I002300"
## [173] "Prupe.I002300" "Prupe.I002400" "Prupe.I002400" "Prupe.I002600"
## [177] "Prupe.I002600" "Prupe.I002800" "Prupe.I002800" "Prupe.I002900"
## [181] "Prupe.I002900" "Prupe.I003000" "Prupe.I003000" "Prupe.I003100"
## [185] "Prupe.I003100" "Prupe.I003100" "Prupe.I003200" "Prupe.I003200"
## [189] "Prupe.I003200" "Prupe.I003300" "Prupe.I003400" "Prupe.I003400"
## [193] "Prupe.I003400" "Prupe.I003400" "Prupe.I003400" "Prupe.I003400"
## [197] "Prupe.I003400" "Prupe.I003800" "Prupe.I003800" "Prupe.I003900"
## [201] "Prupe.I003900" "Prupe.I004000" "Prupe.I004000" "Prupe.I004400"
## [205] "Prupe.I004400" "Prupe.I004400" "Prupe.I004400" "Prupe.I004400"
## [209] "Prupe.I004400" "Prupe.I004500" "Prupe.I004500" "Prupe.I004500"
## [213] "Prupe.I004500" "Prupe.I004500" "Prupe.I004500" "Prupe.I005000"
## [217] "Prupe.I005000" "Prupe.I005100" "Prupe.I005200" "Prupe.I005200"
## [221] "Prupe.I005200" "Prupe.I005200" "Prupe.I005200" "Prupe.I005500"
## [225] "Prupe.I005500" "Prupe.I005500" "Prupe.I005600" "Prupe.I005700"
## [229] "Prupe.I005800" "Prupe.I006100" "Prupe.I006200"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))


gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4819358 257.4    9405554  502.4  11756942  627.9
## Vcells 77105237 588.3  190630912 1454.4 190628107 1454.4
library(magrittr)
library(ggplot2)
library(ComplexUpset)

5.10 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

5.11 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 23113
length(unique(df$to_geneID))
## [1] 21227
range(df$from_count)
## [1]  1 57
range(df$to_count)
## [1]   1 115
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 264
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 135
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag1 == 1) {
  methods = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  methods = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          60256           8696           2466
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       28995     7390           2167
##   2        8716      797            143
##   3        5356      231             64
##   4        5513      140             40
##   5        6882       93             32
##   6        4794       45             20
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")



# Initialize an empty set or list covered_genes.
# 
# For each method in the list:
# ["MCScanX", "ensembl-compara", "PLAZA", "OrthoDB", "RBH", "FastOMA"]
# 
# For every row in the data table dt:
# 
# a. Check if filter_criteria is "reject" for the row.
# 
# b. Check if the value of the column corresponding to method in this row is TRUE.
# 
# c. Check if from_geneID in this row is not in covered_genes.
# 
# d. If method is one of ["OrthoDB", "RBH", "FastOMA"]:
      # Check if gene pair is covered by all three methods:
      # If yes:
      # is_candidate = TRUE
      # new_criteria = "OrthoDB_FastOMA_RBH"
      # Else if gene pair is covered by any two of those three methods:
      # If yes:
      # is_candidate = TRUE
      # new_criteria = join the two method names with underscore, e.g. "OrthoDB_FastOMA" or "RBH_FastOMA" or "OrthoDB_RBH"
      # Else if MapMan4_Match string contains "match based on" and current method name:
      # is_candidate = TRUE
      # new_criteria = {method}_MapMan4 (e.g., "RBH_MapMan4")
      # Else:
      # is_candidate = FALSE


special_methods = c("OrthoDB", "RBH", "FastOMA")

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {
  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            3923            1100            1947
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                5084                1947                1109                 231 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               17871                 415                3923                 516 
##               PLAZA         RBH_MapMan4              reject 
##                4912                1100               34310
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

5.12 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          34589           1921            598
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1        9932      949            397
##   2        4832      530             62
##   3        3682      179             49
##   4        4769      125             38
##   5        6580       93             32
##   6        4794       45             20
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag1 == 1) {
  source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}





# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 20588
length(unique(kept$to_geneID))
## [1] 20794
range(kept$from_count)
## [1]  1 50
range(kept$to_count)
## [1]  1 96
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 7
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 15
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

5.13 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 145                  38                  43                   7 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 728                  27                  51                  16 
##               PLAZA         RBH_MapMan4              reject 
##                 166                  19                1154
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  
  plantName1 = 'pdul'
  , # change name - PLAZA, OrthoDB, RBH
  plantName2 = 'prunus_dulcis'
  , # change name - compara # sources
  plantName3 = 'almond'
  ,  # change name - MCScanX # sources
  plantName4 = 'pdul'
  ,  # change name - FastOMA # sources
  
  plantDirIn = "pdul_almond"
  , # inconsistent-IDs, orthofinder
  plantNameOut = "almond"
  ,
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "almond")
  ,

  pattern_in = "\\.[^.]*$"
  , # everythin after the last dot
  pattern_out = ""
  , # all-IDs
  compara_pattern_in1 = ""
  ,
  compara_pattern_out1 = ""
  ,
  compara_pattern_in2 = ""
  ,
  compara_pattern_out2 = ""
  ,
  plaza_pattern_in1 = ""
  ,
  plaza_pattern_in2 = ""
  ,
  
  ref_genome = "Texas_F1_protein"
  , # inconsistent-IDs, orthofinder for OrthoDB
  
  mercator = 'pdul_Mercator4v7_results.txt'
  , # plant-gmm
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']"
  , # plant-gmm, generic removal of nonsence
  mercatorPatternOut1 = ""
  , # plant-gmm
  mercatorPatternIn2 = "([fg])"
  , # plant-gmm
  mercatorPatternOut2 = "\\U\\1" # plant-gmm
  ,
  flag1 = 2
  ,
  flag2 = 2
  ,
  flag3 = FALSE
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000024f76449350>

child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-78] | |….. | 9% | |…… | 12% [unnamed-chunk-79] | |…….. | 15% | |……… | 18% [unnamed-chunk-80] | |……….. | 21% | |………… | 24% [unnamed-chunk-81] | |………….. | 27% | |…………… | 30% [unnamed-chunk-82] | |…………….. | 33% | |………………. | 36% [unnamed-chunk-83] | |……………….. | 39% | |…………………. | 42% [unnamed-chunk-84] | |………………….. | 45% | |……………………. | 48% [unnamed-chunk-85] | |…………………….. | 52% | |………………………. | 55% [unnamed-chunk-86] | |……………………….. | 58% | |…………………………. | 61% [unnamed-chunk-87] | |………………………….. | 64% | |……………………………. | 67% [unnamed-chunk-88] | |……………………………… | 70% | |………………………………. | 73% [unnamed-chunk-89] | |………………………………… | 76% | |…………………………………. | 79% [unnamed-chunk-90] | |…………………………………… | 82% | |……………………………………. | 85% [unnamed-chunk-91] | |……………………………………… | 88% | |………………………………………. | 91% [unnamed-chunk-92] | |………………………………………… | 94% | |…………………………………………. | 97% [unnamed-chunk-93] | |……………………………………………| 100%

cat(child_content)

6 Subsection: pdul

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

6.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'ensembl-compara') {
    
    dt = dt[dt$homology_species == plantName2, ]
    # print(head(dt))
    dt = dt[, c(1,2,6,7,10)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    df = rbind(df, dt)
    
  } else if (us == 'FastOMA') {
    
    dt = dt[dt$to_plant == plantName4, ]
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 5)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 1] = NA
    dt[, 3] = NA
    df = rbind(df, dt)
    
  } else if (us == 'MCScanX') {
    
    # dt = dt[grepl('stu', dt$to_plant), ]
    dt = dt[grepl(plantName3, dt$to_plant), ] #  change names
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 6)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 1] = NA
    dt[, 3] = NA
    df = rbind(df, dt)
    
  } else if (us == 'PLAZA') {
    
    dt = dt[dt$orthologous_species == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  } else if (us == 'OrthoDB') {
    
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  } else if (us == 'RBH') {
    
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  }   else print ('ERROR: Unknown source')
}
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
table(df$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB             RBH 
##           16421           42927           36681           35734           24829
df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_protID to_geneID       to_protID        source         
##    <chr>       <chr>       <chr>           <chr>            <chr>          
##  1 <NA>        AT2G05620.2 <NA>            TexasF1_G1000.1  FastOMA        
##  2 <NA>        AT3G48880.2 <NA>            TexasF1_G10001.1 FastOMA        
##  3 <NA>        AT3G48890.1 <NA>            TexasF1_G9999.1  FastOMA        
##  4 <NA>        AT5G52240.1 <NA>            TexasF1_G9999.1  FastOMA        
##  5 <NA>        AT5G22060.1 <NA>            TexasF1_G100.1   MCScanX        
##  6 <NA>        AT3G48880.1 <NA>            TexasF1_G10000.1 MCScanX        
##  7 <NA>        AT5G52240.1 <NA>            TexasF1_G9999.1  MCScanX        
##  8 <NA>        AT5G52240.2 <NA>            TexasF1_G9999.1  MCScanX        
##  9 AT1G23390   <NA>        TexasF1_G3359   <NA>             OrthoDB        
## 10 AT5G19210   <NA>        TexasF1_G2060   <NA>             OrthoDB        
## 11 AT3G51810   <NA>        TexasF1_G23162  <NA>             OrthoDB        
## 12 AT2G28815   <NA>        TexasF1_G6420   <NA>             OrthoDB        
## 13 AT1G01030   <NA>        TexasF1_G18833  <NA>             RBH            
## 14 AT1G01040   <NA>        TexasF1_G9057   <NA>             RBH            
## 15 ATMG00860   <NA>        TexasF1_G25095  <NA>             RBH            
## 16 ATMG01250   <NA>        TexasF1_G22012  <NA>             RBH            
## 17 AT1G01020   AT1G01020.1 Prudul26B025674 VVA35962         ensembl-compara
## 18 AT1G01040   AT1G01040.2 Prudul26B022109 VVA35960         ensembl-compara
## 19 AT5G67620   AT5G67620.1 Prudul26B020268 VVA23378         ensembl-compara
## 20 AT5G67630   AT5G67630.1 Prudul26B028327 VVA20286         ensembl-compara

6.2 Transcript (aka protein) to geneID

ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])

# orfs!
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)
## 
## MCScanX 
##       5
print(df[ind, ])
##        from_geneID     from_protID to_geneID        to_protID  source
##             <char>          <char>    <char>           <char>  <char>
## 1: AT4G19110.uORF1 AT4G19110.uORF1      <NA> TexasF1_G17548.1 MCScanX
## 2: AT5G45430.uORF1 AT5G45430.uORF1      <NA> TexasF1_G17548.1 MCScanX
## 3: AT1G48600.uORF1 AT1G48600.uORF1      <NA> TexasF1_G19862.1 MCScanX
## 4: AT1G06150.uORF1 AT1G06150.uORF1      <NA> TexasF1_G29714.1 MCScanX
## 5: AT2G31280.uORF1 AT2G31280.uORF1      <NA> TexasF1_G29714.1 MCScanX
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed



df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_protID to_geneID       to_protID        source         
##    <chr>       <chr>       <chr>           <chr>            <chr>          
##  1 AT2G05620   AT2G05620.2 TexasF1_G1000   TexasF1_G1000.1  FastOMA        
##  2 AT3G48880   AT3G48880.2 TexasF1_G10001  TexasF1_G10001.1 FastOMA        
##  3 AT3G48890   AT3G48890.1 TexasF1_G9999   TexasF1_G9999.1  FastOMA        
##  4 AT5G52240   AT5G52240.1 TexasF1_G9999   TexasF1_G9999.1  FastOMA        
##  5 AT5G22060   AT5G22060.1 TexasF1_G100    TexasF1_G100.1   MCScanX        
##  6 AT3G48880   AT3G48880.1 TexasF1_G10000  TexasF1_G10000.1 MCScanX        
##  7 AT5G52240   AT5G52240.1 TexasF1_G9999   TexasF1_G9999.1  MCScanX        
##  8 AT5G52240   AT5G52240.2 TexasF1_G9999   TexasF1_G9999.1  MCScanX        
##  9 AT1G23390   <NA>        TexasF1_G3359   <NA>             OrthoDB        
## 10 AT5G19210   <NA>        TexasF1_G2060   <NA>             OrthoDB        
## 11 AT3G51810   <NA>        TexasF1_G23162  <NA>             OrthoDB        
## 12 AT2G28815   <NA>        TexasF1_G6420   <NA>             OrthoDB        
## 13 AT1G01030   <NA>        TexasF1_G18833  <NA>             RBH            
## 14 AT1G01040   <NA>        TexasF1_G9057   <NA>             RBH            
## 15 ATMG00860   <NA>        TexasF1_G25095  <NA>             RBH            
## 16 ATMG01250   <NA>        TexasF1_G22012  <NA>             RBH            
## 17 AT1G01020   AT1G01020.1 Prudul26B025674 VVA35962         ensembl-compara
## 18 AT1G01040   AT1G01040.2 Prudul26B022109 VVA35960         ensembl-compara
## 19 AT5G67620   AT5G67620.1 Prudul26B020268 VVA23378         ensembl-compara
## 20 AT5G67630   AT5G67630.1 Prudul26B028327 VVA20286         ensembl-compara
summary_na = df[, .(
  na_to_geneID = sum(is.na(to_geneID)),
  na_to_protID = sum(is.na(to_protID))
), by = source]
print(summary_na)
##             source na_to_geneID na_to_protID
##             <char>        <int>        <int>
## 1: ensembl-compara            0            0
## 2:         FastOMA            0            0
## 3:         MCScanX            0            0
## 4:         OrthoDB            0        35734
## 5:             RBH            0        24829

6.3 PLAZA and ensembl-compara with Orthofinder

here we have some loses because genes between versions do not translate well!

fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
fl = list.files(fp)
fn = fl[grep('Compara_', fl)] # change filename
compara = data.table::fread(file.path(fp, fn))
fn = fl[grep('PLAZA_', fl)] # change filename
if (length(fn) != 0) {
  plaza = data.table::fread(file.path(fp, fn))
} else {
  plaza = data.frame(matrix(ncol = 4, nrow = 0))
}


compara = compara[compara$Species == ref_genome, ] # change name
plaza = plaza[plaza$Species == ref_genome, ] # change name


colnames(compara)[3] = colnames(plaza)[3] = 'source'


compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = compara[, {
  # Cartesian join of OrthoDB_list and Orthologs_list for this row
  pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
  setnames(pairs, c("OrthoDB_ID", "Ortholog"))
  pairs
}, by = seq_len(nrow(compara))]
compara = result[, seq_len := NULL]
# compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1, 
                         sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
compara = compara[!duplicated(compara), ]
head(compara)
##    OrthoDB_ID         Ortholog
##        <char>           <char>
## 1:   VVA36572  TexasF1_G6946.1
## 2:   VVA37509  TexasF1_G1328.1
## 3:   VVA37509  TexasF1_G1832.1
## 4:   VVA37509 TexasF1_G24981.1
## 5:   VVA37509  TexasF1_G6849.1
## 6:   VVA37509 TexasF1_G16889.1
if (nrow(plaza) != 0) {
  plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
  plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
  result = plaza[, {
    # Cartesian join of OrthoDB_list and Orthologs_list for this row
    pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
    setnames(pairs, c("OrthoDB_ID", "Ortholog"))
    pairs
  }, by = seq_len(nrow(plaza))]
  plaza = result[, seq_len := NULL]
  # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
  plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
  plaza = plaza[!duplicated(plaza), ]
  head(plaza)  
}

if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog)

if (flag2 == 1) { # geneID and prot ID are completely different # make flags
  df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
    dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog)  
} else {
    df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
    dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog) 
}

df_compara = df_compara[!is.na(df_compara$to_geneID), ]


if (nrow(plaza) != 0) {
  df_plaza = dplyr::filter(df, source == "PLAZA") %>%
    dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog)
  df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
}

if (nrow(plaza) != 0) {
  df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))  
  dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
} else {
  df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
  dt = dplyr::bind_rows(df_compara, df_other)
}

ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
df = dt[, ..ind]
df = df[!duplicated(df), ]


if (nrow(plaza) != 0) {
  ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
  df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
} else {
  ind = which(df$source %in% c('ensembl-compara'))
  df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
}





df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 3
##    from_geneID to_geneID      source         
##    <chr>       <chr>          <chr>          
##  1 AT2G05620   TexasF1_G1000  FastOMA        
##  2 AT3G48880   TexasF1_G10001 FastOMA        
##  3 AT3G48890   TexasF1_G9999  FastOMA        
##  4 AT5G52240   TexasF1_G9999  FastOMA        
##  5 AT5G22060   TexasF1_G100   MCScanX        
##  6 AT3G48880   TexasF1_G10000 MCScanX        
##  7 AT3G48890   TexasF1_G9999  MCScanX        
##  8 AT5G52240   TexasF1_G9999  MCScanX        
##  9 AT1G23390   TexasF1_G3359  OrthoDB        
## 10 AT5G19210   TexasF1_G2060  OrthoDB        
## 11 AT3G51810   TexasF1_G23162 OrthoDB        
## 12 AT2G28815   TexasF1_G6420  OrthoDB        
## 13 AT1G01030   TexasF1_G18833 RBH            
## 14 AT1G01040   TexasF1_G9057  RBH            
## 15 ATMG00860   TexasF1_G25095 RBH            
## 16 ATMG01250   TexasF1_G22012 RBH            
## 17 AT1G01020   TexasF1_G9059  ensembl-compara
## 18 AT1G01040   TexasF1_G9057  ensembl-compara
## 19 AT5G67630   TexasF1_G6308  ensembl-compara
## 20 AT5G67630   TexasF1_G6731  ensembl-compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))




gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3604227 192.5    9405554  502.4  11756942  627.9
## Vcells 81462298 621.6  190630912 1454.4 190628107 1454.4
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

6.4 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22456
length(unique(dt$to_geneID))
## [1] 20903
table(dt$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB             RBH 
##           15975           42927           20151           35734           24829
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

6.5 Upset plot

if (flag1 == 1) {
  source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {
  source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

6.6 Ath ORFs

  • take care, ath cds (for MCScanX) fasta contains for e.g. besides AT1G30330.1, AT1G30330.2, AT1G30330.3
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
dt.wide[grep('ORF', dt.wide$from_geneID), ]
## Key: <from_geneID, to_geneID>
##        from_geneID      to_geneID FastOMA MCScanX OrthoDB    RBH
##             <char>         <char>  <lgcl>  <lgcl>  <lgcl> <lgcl>
## 1: AT1G06150.uORF1 TexasF1_G29714   FALSE    TRUE   FALSE  FALSE
## 2: AT1G48600.uORF1 TexasF1_G19862   FALSE    TRUE   FALSE  FALSE
## 3: AT2G31280.uORF1 TexasF1_G29714   FALSE    TRUE   FALSE  FALSE
## 4: AT4G19110.uORF1 TexasF1_G17548   FALSE    TRUE   FALSE  FALSE
## 5: AT5G45430.uORF1 TexasF1_G17548   FALSE    TRUE   FALSE  FALSE
##    ensembl-compara count_evidence
##             <lgcl>          <num>
## 1:           FALSE              1
## 2:           FALSE              1
## 3:           FALSE              1
## 4:           FALSE              1
## 5:           FALSE              1
dt.wide = dt.wide[grep('ORF', dt.wide$from_geneID, invert = TRUE), ]

6.7 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

6.8 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

6.9 fruitTrees plant gmm

fp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]

combined = gmm[, .(
  BINCODE = paste(unique(BINCODE), collapse = " | "),
  NAME = paste(unique(NAME), collapse = " | "),
  DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]

charToRaw(combined$IDENTIFIER[1])
##  [1] 27 74 65 78 61 73 66 31 5f 67 31 30 34 30 31 2e 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
charToRaw(combined$IDENTIFIER[1])
##  [1] 74 65 78 61 73 66 31 5f 67 31 30 34 30 31 2e 31
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2))  # change as needed
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
##  8713 20903
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "ensembl-compara" "from_count"     
##  [9] "to_count"        "count_evidence"  "ath_BINCODE"     "ath_NAME"       
## [13] "ath_DESCRIPTION" "athName"         "athSynonims"     "all_pathways"   
## [17] "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE 
## 67028
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))


gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3140113 167.8    9405554  502.4  11756942  627.9
## Vcells 44057647 336.2  152504730 1163.6 190628107 1454.4
library(magrittr)
library(ggplot2)
library(ComplexUpset)

6.10 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

6.11 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 22451
length(unique(df$to_geneID))
## [1] 20902
range(df$from_count)
## [1]   1 150
range(df$to_count)
## [1]   1 113
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 171
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 120
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag1 == 1) {
  methods = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  methods = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          54247          10072           2709
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       26635     8113           2222
##   2        8151     1113            217
##   3        5577      378             94
##   4        6765      254             89
##   5        7119      214             87
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")



# Initialize an empty set or list covered_genes.
# 
# For each method in the list:
# ["MCScanX", "ensembl-compara", "PLAZA", "OrthoDB", "RBH", "FastOMA"]
# 
# For every row in the data table dt:
# 
# a. Check if filter_criteria is "reject" for the row.
# 
# b. Check if the value of the column corresponding to method in this row is TRUE.
# 
# c. Check if from_geneID in this row is not in covered_genes.
# 
# d. If method is one of ["OrthoDB", "RBH", "FastOMA"]:
      # Check if gene pair is covered by all three methods:
      # If yes:
      # is_candidate = TRUE
      # new_criteria = "OrthoDB_FastOMA_RBH"
      # Else if gene pair is covered by any two of those three methods:
      # If yes:
      # is_candidate = TRUE
      # new_criteria = join the two method names with underscore, e.g. "OrthoDB_FastOMA" or "RBH_FastOMA" or "OrthoDB_RBH"
      # Else if MapMan4_Match string contains "match based on" and current method name:
      # is_candidate = TRUE
      # new_criteria = {method}_MapMan4 (e.g., "RBH_MapMan4")
      # Else:
      # is_candidate = FALSE


special_methods = c("OrthoDB", "RBH", "FastOMA")

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {
  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            3883            1578            2305
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                4234                2305                1038                 583 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               20146                 722                3883                 692 
##         RBH_MapMan4              reject 
##                1578               31847
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

6.12 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          31920           2377            884
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1        9237      838            497
##   2        4767      764            138
##   3        4396      319             80
##   4        6401      242             82
##   5        7119      214             87
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag1 == 1) {
  source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}





# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 20087
length(unique(kept$to_geneID))
## [1] 20115
range(kept$from_count)
## [1]  1 51
range(kept$to_count)
## [1]  1 96
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 5
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 11
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

6.13 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 118                  50                  53                  25 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 843                  42                  66                  19 
##         RBH_MapMan4              reject 
##                  21                1168
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  
  plantName1 = 'pavi'
  , # change name - PLAZA, OrthoDB, RBH
  plantName2 = 'prunus_avium'
  , # change name - compara # sources
  plantName3 = 'wildcherry'
  ,  # change name - MCScanX # sources
  plantName4 = 'pavi'
  ,  # change name - FastOMA # sources
  
  plantDirIn = "pavi_wildCherry"
  , # inconsistent-IDs, orthofinder
  plantNameOut = "wildcherry"
  ,
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "wildcherry")
  ,

  pattern_in = "-[^-]*$"
  , # everythin after the last dot
  pattern_out = ""
  , # all-IDs
  compara_pattern_in1 = "^(Pav_(sc|co)\\d+\\.\\d+_g\\d+\\.\\d+\\.(br|mk)).*" # instead of ids has some strange concatenation
  ,
  compara_pattern_out1 = "\\1"
  ,
  compara_pattern_in2 = ''
  ,
  compara_pattern_out2 = ''
  ,
  plaza_pattern_in1 = ""
  ,
  plaza_pattern_in2 = ""
  ,
  
  ref_genome = "Prunus_avium_Tieton.proteins"
  , # inconsistent-IDs, orthofinder for OrthoDB
  
  mercator = 'pavi_Mercator4v7_results.txt'
  , # plant-gmm
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']"
  , # plant-gmm, generic removal of nonsence
  mercatorPatternOut1 = ""
  , # plant-gmm
  mercatorPatternIn2 = "Fun"
  , # plant-gmm
  mercatorPatternOut2 = "FUN" # plant-gmm
  ,
  flag1 = 2
  ,
  flag2 = 1
  ,
  flag3 = TRUE # compara$Ortholog contains mrna space gene 
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000024f763469d8>

child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-112] | |….. | 9% | |…… | 12% [unnamed-chunk-113] | |…….. | 15% | |……… | 18% [unnamed-chunk-114] | |……….. | 21% | |………… | 24% [unnamed-chunk-115] | |………….. | 27% | |…………… | 30% [unnamed-chunk-116] | |…………….. | 33% | |……………… | 36% [unnamed-chunk-117] | |……………….. | 39% | |………………… | 42% [unnamed-chunk-118] | |………………….. | 45% | |…………………… | 48% [unnamed-chunk-119] | |…………………….. | 52% | |……………………… | 55% [unnamed-chunk-120] | |……………………….. | 58% | |………………………… | 61% [unnamed-chunk-121] | |………………………….. | 64% | |…………………………… | 67% [unnamed-chunk-122] | |…………………………….. | 70% | |……………………………… | 73% [unnamed-chunk-123] | |……………………………….. | 76% | |………………………………… | 79% [unnamed-chunk-124] | |………………………………….. | 82% | |…………………………………… | 85% [unnamed-chunk-125] | |…………………………………….. | 88% | |……………………………………… | 91% [unnamed-chunk-126] | |……………………………………….. | 94% | |………………………………………… | 97% [unnamed-chunk-127] | |…………………………………………..| 100%

cat(child_content)

7 Subsection: pavi

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

7.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'ensembl-compara') {
    
    dt = dt[dt$homology_species == plantName2, ]
    # print(head(dt))
    dt = dt[, c(1,2,6,7,10)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    df = rbind(df, dt)
    
  } else if (us == 'FastOMA') {
    
    dt = dt[dt$to_plant == plantName4, ]
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 5)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 1] = NA
    dt[, 3] = NA
    df = rbind(df, dt)
    
  } else if (us == 'MCScanX') {
    
    # dt = dt[grepl('stu', dt$to_plant), ]
    dt = dt[grepl(plantName3, dt$to_plant), ] #  change names
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 6)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 1] = NA
    dt[, 3] = NA
    df = rbind(df, dt)
    
  } else if (us == 'PLAZA') {
    
    dt = dt[dt$orthologous_species == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  } else if (us == 'OrthoDB') {
    
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  } else if (us == 'RBH') {
    
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  }   else print ('ERROR: Unknown source')
}
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
table(df$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB             RBH 
##            4504           48941           39725           38228           25594
df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_protID to_geneID                 to_protID            source
##    <chr>       <chr>       <chr>                     <chr>                <chr> 
##  1 <NA>        AT1G12040.1 <NA>                      FUN_000050-T1        FastO…
##  2 <NA>        AT1G62440.1 <NA>                      FUN_000050-T1        FastO…
##  3 <NA>        AT2G43840.2 <NA>                      FUN_040335-T1        FastO…
##  4 <NA>        AT2G44050.1 <NA>                      FUN_040336-T1        FastO…
##  5 <NA>        AT5G58130.1 <NA>                      FUN_000052-T1        MCSca…
##  6 <NA>        AT5G58130.1 <NA>                      FUN_000053-T1        MCSca…
##  7 <NA>        AT2G43840.2 <NA>                      FUN_040335-T1        MCSca…
##  8 <NA>        AT2G44050.1 <NA>                      FUN_040336-T1        MCSca…
##  9 AT4G39370   <NA>        FUN_020728                <NA>                 Ortho…
## 10 AT3G06350   <NA>        FUN_020749                <NA>                 Ortho…
## 11 AT4G24220   <NA>        FUN_029917                <NA>                 Ortho…
## 12 AT4G24220   <NA>        FUN_029968                <NA>                 Ortho…
## 13 AT1G01030   <NA>        FUN_025493                <NA>                 RBH   
## 14 AT1G01040   <NA>        FUN_011748                <NA>                 RBH   
## 15 ATMG01250   <NA>        FUN_040221                <NA>                 RBH   
## 16 ATMG01360   <NA>        FUN_026804                <NA>                 RBH   
## 17 AT1G01210   AT1G01210.3 Pav_sc0000586.1_g580.1.mk Pav_sc0000586.1_g58… ensem…
## 18 AT1G01225   AT1G01225.1 Pav_sc0000586.1_g550.1.mk Pav_sc0000586.1_g55… ensem…
## 19 ATCG00710   ATCG00710.1 Pav_sc0000216.1_g900.1.mk Pav_sc0000216.1_g90… ensem…
## 20 ATMG00310   ATMG00310.1 Pav_sc0001554.1_g020.1.br Pav_sc0001554.1_g02… ensem…

7.2 Transcript (aka protein) to geneID

ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])

# orfs!
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)
## 
## MCScanX 
##       5
print(df[ind, ])
##        from_geneID     from_protID to_geneID     to_protID  source
##             <char>          <char>    <char>        <char>  <char>
## 1: AT4G19110.uORF1 AT4G19110.uORF1      <NA> FUN_023847-T1 MCScanX
## 2: AT5G45430.uORF1 AT5G45430.uORF1      <NA> FUN_023847-T1 MCScanX
## 3: AT5G09460.uORF1 AT5G09460.uORF1      <NA> FUN_028837-T1 MCScanX
## 4: AT5G64340.uORF1 AT5G64340.uORF1      <NA> FUN_028837-T1 MCScanX
## 5: AT1G29950.uORF2 AT1G29950.uORF2      <NA> FUN_032407-T1 MCScanX
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed



df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_protID to_geneID                 to_protID            source
##    <chr>       <chr>       <chr>                     <chr>                <chr> 
##  1 AT1G12040   AT1G12040.1 FUN_000050                FUN_000050-T1        FastO…
##  2 AT1G62440   AT1G62440.1 FUN_000050                FUN_000050-T1        FastO…
##  3 AT2G43840   AT2G43840.2 FUN_040335                FUN_040335-T1        FastO…
##  4 AT2G44050   AT2G44050.1 FUN_040336                FUN_040336-T1        FastO…
##  5 AT5G58130   AT5G58130.1 FUN_000052                FUN_000052-T1        MCSca…
##  6 AT5G58130   AT5G58130.1 FUN_000053                FUN_000053-T1        MCSca…
##  7 AT2G43840   AT2G43840.2 FUN_040335                FUN_040335-T1        MCSca…
##  8 AT2G44050   AT2G44050.1 FUN_040336                FUN_040336-T1        MCSca…
##  9 AT4G39370   <NA>        FUN_020728                <NA>                 Ortho…
## 10 AT3G06350   <NA>        FUN_020749                <NA>                 Ortho…
## 11 AT4G24220   <NA>        FUN_029917                <NA>                 Ortho…
## 12 AT4G24220   <NA>        FUN_029968                <NA>                 Ortho…
## 13 AT1G01030   <NA>        FUN_025493                <NA>                 RBH   
## 14 AT1G01040   <NA>        FUN_011748                <NA>                 RBH   
## 15 ATMG01250   <NA>        FUN_040221                <NA>                 RBH   
## 16 ATMG01360   <NA>        FUN_026804                <NA>                 RBH   
## 17 AT1G01210   AT1G01210.3 Pav_sc0000586.1_g580.1.mk Pav_sc0000586.1_g58… ensem…
## 18 AT1G01225   AT1G01225.1 Pav_sc0000586.1_g550.1.mk Pav_sc0000586.1_g55… ensem…
## 19 ATCG00710   ATCG00710.1 Pav_sc0000216.1_g900.1.mk Pav_sc0000216.1_g90… ensem…
## 20 ATMG00310   ATMG00310.1 Pav_sc0001554.1_g020.1.br Pav_sc0001554.1_g02… ensem…
summary_na = df[, .(
  na_to_geneID = sum(is.na(to_geneID)),
  na_to_protID = sum(is.na(to_protID))
), by = source]
print(summary_na)
##             source na_to_geneID na_to_protID
##             <char>        <int>        <int>
## 1: ensembl-compara            0            0
## 2:         FastOMA            0            0
## 3:         MCScanX            0            0
## 4:         OrthoDB            0        38228
## 5:             RBH            0        25594

7.3 PLAZA and ensembl-compara with Orthofinder

here we have some loses because genes between versions do not translate well!

fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
fl = list.files(fp)
fn = fl[grep('Compara_', fl)] # change filename
compara = data.table::fread(file.path(fp, fn))
fn = fl[grep('PLAZA_', fl)] # change filename
if (length(fn) != 0) {
  plaza = data.table::fread(file.path(fp, fn))
} else {
  plaza = data.frame(matrix(ncol = 4, nrow = 0))
}


compara = compara[compara$Species == ref_genome, ] # change name
plaza = plaza[plaza$Species == ref_genome, ] # change name


colnames(compara)[3] = colnames(plaza)[3] = 'source'


compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
result = compara[, {
  # Cartesian join of OrthoDB_list and Orthologs_list for this row
  pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
  setnames(pairs, c("OrthoDB_ID", "Ortholog"))
  pairs
}, by = seq_len(nrow(compara))]
compara = result[, seq_len := NULL]
# compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1, 
                         sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
compara = compara[!duplicated(compara), ]
head(compara)
##                   OrthoDB_ID                 Ortholog
##                       <char>                   <char>
## 1: Pav_sc0007128.1_g050.1.br FUN_003782-T1 FUN_003782
## 2: Pav_sc0011022.1_g010.1.br FUN_040221-T1 FUN_040221
## 3: Pav_sc0001987.1_g160.1.br FUN_040221-T1 FUN_040221
## 4: Pav_sc0001114.1_g300.1.br FUN_040221-T1 FUN_040221
## 5: Pav_co4062149.1_g010.1.br FUN_040221-T1 FUN_040221
## 6: Pav_sc0001576.1_g400.1.br FUN_040221-T1 FUN_040221
if (nrow(plaza) != 0) {
  plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
  plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
  result = plaza[, {
    # Cartesian join of OrthoDB_list and Orthologs_list for this row
    pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
    setnames(pairs, c("OrthoDB_ID", "Ortholog"))
    pairs
  }, by = seq_len(nrow(plaza))]
  plaza = result[, seq_len := NULL]
  # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
  plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
  plaza = plaza[!duplicated(plaza), ]
  head(plaza)  
}

if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog)

if (flag2 == 1) { # geneID and prot ID are completely different # make flags
  df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
    dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog)  
} else {
    df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
    dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog) 
}

df_compara = df_compara[!is.na(df_compara$to_geneID), ]


if (nrow(plaza) != 0) {
  df_plaza = dplyr::filter(df, source == "PLAZA") %>%
    dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
    dplyr::mutate(to_geneID = Ortholog) %>%
    dplyr::select(-Ortholog)
  df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
}

if (nrow(plaza) != 0) {
  df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))  
  dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
} else {
  df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
  dt = dplyr::bind_rows(df_compara, df_other)
}

ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
df = dt[, ..ind]
df = df[!duplicated(df), ]


if (nrow(plaza) != 0) {
  ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
  df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
} else {
  ind = which(df$source %in% c('ensembl-compara'))
  df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
}





df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 3
##    from_geneID to_geneID  source         
##    <chr>       <chr>      <chr>          
##  1 AT1G12040   FUN_000050 FastOMA        
##  2 AT1G62440   FUN_000050 FastOMA        
##  3 AT2G43840   FUN_040335 FastOMA        
##  4 AT2G44050   FUN_040336 FastOMA        
##  5 AT5G58130   FUN_000052 MCScanX        
##  6 AT5G58130   FUN_000053 MCScanX        
##  7 AT2G43840   FUN_040335 MCScanX        
##  8 AT2G44050   FUN_040336 MCScanX        
##  9 AT4G39370   FUN_020728 OrthoDB        
## 10 AT3G06350   FUN_020749 OrthoDB        
## 11 AT4G24220   FUN_029917 OrthoDB        
## 12 AT4G24220   FUN_029968 OrthoDB        
## 13 AT1G01030   FUN_025493 RBH            
## 14 AT1G01040   FUN_011748 RBH            
## 15 ATMG01250   FUN_040221 RBH            
## 16 ATMG01360   FUN_026804 RBH            
## 17 AT1G01210   FUN_011652 ensembl-compara
## 18 AT1G01225   FUN_011648 ensembl-compara
## 19 AT5G67620   FUN_021483 ensembl-compara
## 20 ATMG00310   FUN_030126 ensembl-compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))




gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  2611684 139.5    7524444  401.9  11756942  627.9
## Vcells 50980318 389.0  152524140 1163.7 190628107 1454.4
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

7.4 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22172
length(unique(dt$to_geneID))
## [1] 21950
table(dt$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB             RBH 
##            4367           45924           19709           38228           25594
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

7.5 Upset plot

if (flag1 == 1) {
  source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {
  source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

7.6 Ath ORFs

  • take care, ath cds (for MCScanX) fasta contains for e.g. besides AT1G30330.1, AT1G30330.2, AT1G30330.3
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
dt.wide[grep('ORF', dt.wide$from_geneID), ]
## Key: <from_geneID, to_geneID>
##        from_geneID  to_geneID FastOMA MCScanX OrthoDB    RBH ensembl-compara
##             <char>     <char>  <lgcl>  <lgcl>  <lgcl> <lgcl>          <lgcl>
## 1: AT1G29950.uORF2 FUN_032407   FALSE    TRUE   FALSE  FALSE           FALSE
## 2: AT4G19110.uORF1 FUN_023847   FALSE    TRUE   FALSE  FALSE           FALSE
## 3: AT5G09460.uORF1 FUN_028837   FALSE    TRUE   FALSE  FALSE           FALSE
## 4: AT5G45430.uORF1 FUN_023847   FALSE    TRUE   FALSE  FALSE           FALSE
## 5: AT5G64340.uORF1 FUN_028837   FALSE    TRUE   FALSE  FALSE           FALSE
##    count_evidence
##             <num>
## 1:              1
## 2:              1
## 3:              1
## 4:              1
## 5:              1
dt.wide = dt.wide[grep('ORF', dt.wide$from_geneID, invert = TRUE), ]

7.7 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

7.8 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

7.9 fruitTrees plant gmm

fp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]

combined = gmm[, .(
  BINCODE = paste(unique(BINCODE), collapse = " | "),
  NAME = paste(unique(NAME), collapse = " | "),
  DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]

charToRaw(combined$IDENTIFIER[1])
##  [1] 27 66 75 6e 5f 30 31 33 33 39 34 2d 74 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
charToRaw(combined$IDENTIFIER[1])
##  [1] 66 75 6e 5f 30 31 33 33 39 34 2d 74 31
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2))  # change as needed
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
## 16470 23868
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "ensembl-compara" "from_count"     
##  [9] "to_count"        "count_evidence"  "ath_BINCODE"     "ath_NAME"       
## [13] "ath_DESCRIPTION" "athName"         "athSynonims"     "all_pathways"   
## [17] "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE  TRUE 
## 76691     2
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## [1] "FUN_040149" "FUN_040149"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))


gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3216619 171.8    7524444  401.9  11756942  627.9
## Vcells 45501716 347.2  152524140 1163.7 190628107 1454.4
library(magrittr)
library(ggplot2)
library(ComplexUpset)

7.10 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

7.11 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 22167
length(unique(df$to_geneID))
## [1] 21948
range(df$from_count)
## [1]   1 122
range(df$to_count)
## [1]   1 115
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 242
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 131
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag1 == 1) {
  methods = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  methods = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          57035          12115           2872
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       28882    10077           2401
##   2        9643     1295            248
##   3        8304      454            119
##   4        8447      244             84
##   5        1759       45             20
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")



# Initialize an empty set or list covered_genes.
# 
# For each method in the list:
# ["MCScanX", "ensembl-compara", "PLAZA", "OrthoDB", "RBH", "FastOMA"]
# 
# For every row in the data table dt:
# 
# a. Check if filter_criteria is "reject" for the row.
# 
# b. Check if the value of the column corresponding to method in this row is TRUE.
# 
# c. Check if from_geneID in this row is not in covered_genes.
# 
# d. If method is one of ["OrthoDB", "RBH", "FastOMA"]:
      # Check if gene pair is covered by all three methods:
      # If yes:
      # is_candidate = TRUE
      # new_criteria = "OrthoDB_FastOMA_RBH"
      # Else if gene pair is covered by any two of those three methods:
      # If yes:
      # is_candidate = TRUE
      # new_criteria = join the two method names with underscore, e.g. "OrthoDB_FastOMA" or "RBH_FastOMA" or "OrthoDB_RBH"
      # Else if MapMan4_Match string contains "match based on" and current method name:
      # is_candidate = TRUE
      # new_criteria = {method}_MapMan4 (e.g., "RBH_MapMan4")
      # Else:
      # is_candidate = FALSE


special_methods = c("OrthoDB", "RBH", "FastOMA")

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {
  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            5081            1779            3127
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                1319                3127                2137                 893 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               19814                3065                5081                1317 
##         RBH_MapMan4              reject 
##                1779               33490
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

7.12 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          34942           2583           1007
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       11045      883            634
##   2        6524     1002            163
##   3        7269      411            106
##   4        8345      242             84
##   5        1759       45             20
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag1 == 1) {
  source_cols = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  source_cols = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}





# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 19864
length(unique(kept$to_geneID))
## [1] 20842
range(kept$from_count)
## [1]  1 59
range(kept$to_count)
## [1]  1 96
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 11
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 15
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

7.13 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                  49                 113                  79                  36 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 795                  84                 109                  61 
##         RBH_MapMan4              reject 
##                  28                1414
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
# Step 1: params_list
# params_list <- list(
# ...
# )
# 
# Step 2: YAML header in 09_fruitTrees-child.Rmd
# ---
# title: "fruitTrees Child"
# output: html_document
# params:
#   plantName1: NULL
#   plantName2: NULL
#   plantName3: NULL
#   plantName4: NULL
#   plantDirIn: NULL
#   plantNameOut: NULL
#   plantDirOut: NULL
#   pattern_in: NULL
#   pattern_out: NULL
#   compara_pattern_in1: NULL
#   compara_pattern_in2: NULL
#   plaza_pattern_in1: NULL
#   plaza_pattern_in2: NULL
#   ref_genome: NULL
#   mercator: NULL
#   mercatorPatternIn1: NULL
#   mercatorPatternOut1: NULL
#   mercatorPatternIn2: NULL
#   mercatorPatternOut2: NULL
# ---
# 
# 
# access params in the script like:
# params$plantName1
# params$plantDirOut
# 
# Step 3: Call render() like
# rmarkdown::render(
#   input = "09_fruitTrees-child.Rmd",
#   params = params_list,
#   envir = new.env(parent = globalenv()),  # optional: isolate execution
#   output_format = "html_document",
#   quiet = FALSE
# )
# 
# 
# This will:
# Inject params_list into params$...
# Knit the child Rmd in a separate process
# Print progress to the console (quiet = FALSE)
# Save an .html file to the working directory

8 SessionInfo

sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8 
## [2] LC_CTYPE=English_United Kingdom.utf8   
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.utf8    
## 
## time zone: Europe/Ljubljana
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_3.5.2      knitr_1.50         data.table_1.17.0 
## [5] magrittr_2.0.3    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     crayon_1.5.3       dplyr_1.1.4       
##  [5] compiler_4.4.1     zip_2.3.2          Rcpp_1.0.14        tidyselect_1.2.1  
##  [9] stringr_1.5.1      dichromat_2.0-0.1  jquerylib_0.1.4    textshaping_1.0.1 
## [13] systemfonts_1.2.3  scales_1.4.0       yaml_2.3.10        fastmap_1.2.0     
## [17] R6_2.6.1           labeling_0.4.3     patchwork_1.3.0    generics_0.1.4    
## [21] igraph_2.1.4       openxlsx_4.2.8     tibble_3.2.1       bslib_0.9.0       
## [25] pillar_1.10.2      RColorBrewer_1.1-3 rlang_1.1.5        utf8_1.2.5        
## [29] cachem_1.1.0       stringi_1.8.7      xfun_0.52          sass_0.4.10       
## [33] cli_3.6.3          withr_3.0.2        digest_0.6.37      grid_4.4.1        
## [37] rstudioapi_0.17.1  lifecycle_1.0.4    vctrs_0.6.5        evaluate_1.0.3    
## [41] glue_1.8.0         farver_2.1.2       ragg_1.4.0         colorspace_2.1-1  
## [45] rmarkdown_2.29     tools_4.4.1        pkgconfig_2.0.3    htmltools_0.5.8.1